# hail / plotting imports and Hail session initialisation.
# Duplicates removed: `show` was imported from both bokeh.io and
# bokeh.plotting (they are the same re-exported function), and the
# bokeh.io line appeared twice.
from pprint import pprint
from bokeh.io import output_notebook, show
from bokeh.layouts import gridplot
from bokeh.models import Span
from bokeh.plotting import figure, output_file
import pandas as pd
import os, sys, time
import numpy as np
import hail as hl

hl.init()
# route bokeh output into the notebook
output_notebook()
#####################################
# importing maually parsed vcf file
#####################################
geno = hl.import_vcf('/media/CERBADATA/vc_proj/all_outputs/joint_vcf/vep/manually_parsed_2_snp_annotated.vcf')
#geno = hl.read_matrix_table('/media/CERBADATA/vc_proj/ALL_COHORTS.jc.vcf.FILTERED.SELECTED.mt')
# for resetting the variable, if something goes wrong along the way
#%reset_selective geno
geno.describe()
# Sample QC: adds the per-sample `sample_qc` column struct
geno = hl.sample_qc(geno)
# allele-number distribution straight from the VCF INFO field
show(hl.plot.histogram(geno.info.AN))
# qc before filtering: histogram every per-sample QC metric, displayed in
# the same order the individual plots were shown originally
for metric in ('n_non_ref', 'r_het_hom_var', 'r_ti_tv', 'call_rate',
               'n_insertion', 'n_deletion', 'r_insertion_deletion',
               'n_hom_var', 'n_singleton', 'n_snp', 'n_transition',
               'n_transversion', 'n_star'):
    show(hl.plot.histogram(geno.sample_qc[metric]))
# variant qc: adds the per-variant `variant_qc` row struct
geno = hl.variant_qc(geno)
# histogram each per-variant QC metric, in the original display order
for metric in ('call_rate', 'n_called', 'n_not_called', 'n_het',
               'n_non_ref', 'het_freq_hwe', 'p_value_hwe'):
    show(hl.plot.histogram(geno.variant_qc[metric]))
##### filtering step
# Sample-level hard filters:
#   mean sample GQ >= 30
#   r_het_hom_var strictly inside (1, 2)
#   r_ti_tv > 2
#   r_insertion_deletion strictly inside (0.5, 1.2)
# The per-sample metrics do not depend on which other samples are present,
# so one conjunctive filter is equivalent to the original chained filters.
#geno.filter_cols(geno.sample_qc.dp_stats.mean >= 4)
sq = geno.sample_qc
geno = geno.filter_cols((sq.r_het_hom_var > 1)
                        & (sq.r_het_hom_var < 2)
                        & (sq.r_ti_tv > 2)
                        & (sq.r_insertion_deletion > 0.5)
                        & (sq.r_insertion_deletion < 1.2)
                        & (sq.gq_stats.mean >= 30))
# All qc plots
# sample_qc: recompute on the filtered cohort, then re-plot the ratio metrics
geno = hl.sample_qc(geno)
for ratio in ('r_insertion_deletion', 'r_ti_tv', 'r_het_hom_var'):
    show(hl.plot.histogram(geno.sample_qc[ratio]))
show(hl.plot.histogram(geno.sample_qc.gq_stats.mean, range=(10,70), bins=100,
                       title='Mean Sample GQ Histogram for JGT',
                       legend='Mean Sample GQ'))
# subsetting geno
# keep only well-called variants for relatedness estimation
geno_f_sub = geno.filter_rows(geno.variant_qc.call_rate >= 0.99)
# NOTE(review): n_non_ref is an integer count, so `>= 9.97` behaves as `>= 10`
# — confirm the fractional threshold is intentional
geno_f_sub = geno_f_sub.filter_rows(geno_f_sub.variant_qc.n_non_ref >= 9.97)
#### Kinship analysis
# PC-Relate kinship estimates on the subset (k=2 principal components;
# 0.001 is presumably the min individual MAF — verify against hail docs)
pc_rel = hl.pc_relate(geno_f_sub.GT, 0.001, k=2, statistics='kin')
# sample pairs with kinship above 0.125 (commonly used 2nd-degree cutoff)
pairs = pc_rel.filter(pc_rel['kin'] > 0.125)
# pick a maximal set of samples whose removal leaves no related pair
related_samples_to_remove = hl.maximal_independent_set(pairs.i, pairs.j, keep=False)
# drop those samples from the FULL matrix table (not just the subset)
geno = geno.filter_cols(hl.is_defined(related_samples_to_remove[geno.col_key]), keep=False)
# recompute variant QC on the pruned cohort and drop now-monomorphic sites
geno = hl.variant_qc(geno)
geno = geno.filter_rows(geno.variant_qc.n_non_ref > 0)
print('number of variants after quality filtering')
print(geno.count_rows())
print('number of samples remaining after quality filtering')
print(geno.count_cols())
# correlation plots
# cohort alternate-allele frequency = 1 - reference AF
geno = geno.annotate_rows(REX_AF = 1 - geno.variant_qc.AF[0])
geno.describe()
# scatter the cohort AF against every gnomAD population AF, one figure each
for field, label in (('gnomAD_NFE_AF', 'gnomAD NFE AF'),
                     ('gnomAD_AFR_AF', 'gnomAD AFR AF'),
                     ('gnomAD_AMR_AF', 'gnomAD AMR AF'),
                     ('gnomAD_EAS_AF', 'gnomAD EAS AF'),
                     ('gnomAD_SAS_AF', 'gnomAD SAS AF'),
                     ('gnomAD_FIN_AF', 'gnomAD FIN AF'),
                     ('gnomAD_AF', 'gnomAD global AF')):
    show(hl.plot.scatter(geno.REX_AF, geno.info[field],
                         xlabel='REX AF', ylabel=label, size=4))
# BUG FIX: `geno.info.AN = geno.variant_qc.AN` sets an attribute on a Hail
# expression object — Hail expressions are immutable and MatrixTables are not
# modified by attribute assignment, so the exported VCF would keep the stale
# AN. Use annotate_rows to rebuild the info struct with the recomputed AN.
geno = geno.annotate_rows(info=geno.info.annotate(AN=geno.variant_qc.AN))
hl.export_vcf(geno, '/media/CERBADATA/vc_proj/all_outputs/joint_vcf/vep/geno_an.bgz')
# Reload the VCF after filtering to extract p-values.
#
# Overrepresentation analysis plan:
#   1) Select variants whose clinvar_significance is pathogenic ("p") or
#      likely pathogenic ("l").
#   2) Compute a p-value from the binomial distribution: annotate the rows of
#      the MatrixTable with a variable (e.g. REX_AC) holding the total count
#      of the alternative allele at each position (something like
#      mt.variant_qc.AN - mt.variant_qc.AC[0]). The overrepresentation
#      p-value is the binomial probability of drawing REX_AC alternate
#      alleles in a sample of size mt.variant_qc.AN if the true allele
#      frequency (success probability) were mt.info.gnomAD_AF.
#
# This can be expressed as a vectorized operation on the MatrixTable inside
# annotate_rows.
# Imports for the downstream pandas/scipy analysis.
# Fixed: `matplotlib.pyplot` was imported twice; grouped third-party imports.
# (pandas and numpy are also imported at the top of the file — kept here so
# this cell still runs standalone in a notebook.)
import re
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy.stats as ss
from scipy.stats import binom
import seaborn as sns
import statsmodels
file = '/media/CERBADATA/vc_proj/all_outputs/joint_vcf/vep/manually_parsed_corr_3.selected.vcf'

# Parse (likely) pathogenic variants out of the annotated VCF and compute a
# binomial overrepresentation p-value per variant.
# Regex fixes vs the original:
#  * clinvar pattern no longer requires a trailing ';', so the last INFO key
#    on a line is matched instead of raising IndexError;
#  * AC=/AN= are anchored on a preceding tab or ';' so they cannot match
#    substrings of other keys (e.g. MLEAC=, gnomAD_AN=).
with open(file) as f:
    #df = pd.DataFrame(columns=['gene', 'rsid', 'zygosity', 'clinvar', 'gnomad_AF', 'AC', 'AN', 'p_val'])
    df = []
    for line in f:
        if line.startswith('#'):
            continue
        clinvar_sig = re.findall(r'clinvar_significance=([^;\s]+)', line)[0]
        if clinvar_sig not in ('pathogenic', 'likely_pathogenic',
                               'pathogenic&likely_pathogenic'):
            continue
        g_AF = float(re.findall(r'gnomAD_NFE_AF=([^;]+)', line)[0])  # p_success
        # AC may be comma-separated per alt allele; sum them
        s_AC = sum(int(x) for x in re.findall(r'[\t;]AC=([^;]+)', line)[0].split(','))  # n_success
        s_AN = int(re.findall(r'[\t;]AN=([^;]+)', line)[0])  # sample_size
        # list of '1/1' genotype hits (empty when no hom-alt sample on the line)
        zygosity = re.findall(r'\t(1/1)', line)
        gene_n = re.findall(r'gene_name=([^;]+)', line)[0]
        rs = re.findall(r'\t(rs\d*)\t', line)
        # binomial probability of exactly s_AC alt alleles out of s_AN draws
        # if the true AF were the gnomAD NFE AF
        p_value = ss.binom.pmf(s_AC, s_AN, g_AF, loc=0)
        df.append((gene_n, rs, zygosity, clinvar_sig, g_AF, s_AC, s_AN, p_value))

data = pd.DataFrame(df)
data.columns = ['gene_name', 'rs', 'zygosity', 'clinvar_sig', 'g_AF', 's_AC', 's_AN', 'p_value']
# Each `zygosity` cell is the findall result list; keep only its first
# element ('1/1' when a hom-alt genotype was found, NaN otherwise), map
# missing matches to 0, then keep only the homozygous (1/1) rows.
data['zygosity'] = data['zygosity'].str[0]
data['zygosity'] = data['zygosity'].fillna(0)
data = data[data['zygosity'] == '1/1']
# Required filters: AC greater than 0, gnomAD_AF greater than 0, and a
# binomial p-value below 0.01.
# keep significant (p <= 0.01), rare in gnomAD (AF <= 0.005), and observed
# (AC != 0, gnomAD AF != 0) variants
data = data[data['p_value'] <= 0.01]
data = data[data['g_AF'] <= 0.005]
data = data[data['s_AC'] != 0]
data = data[data['g_AF'] != 0]
# rs was captured with findall -> take the first match per row
data['rs'] = data['rs'].str[0]
# Sort the rows by alternate-allele count, highest first
data = data.sort_values(by='s_AC', ascending=False)
data
# loading omim tables
# tab-separated OMIM phenotype table; its 'genes' column holds
# '|'-separated gene symbols (split further below)
omim = pd.read_csv('/home/mrinalv/project/use_omim_table.txt', sep='\t')
omim.head()
def explode(df, lst_cols, fill_value='', preserve_index=False):
    """Explode list-valued column(s) of *df* into one row per list element.

    Parameters
    ----------
    df : pd.DataFrame
    lst_cols : label or list of labels whose cells hold lists; when several
        are given, the lists must have equal lengths within each row.
    fill_value : value substituted for rows whose lists are empty.
    preserve_index : keep the (repeated) original index values when True;
        otherwise the result gets a fresh RangeIndex.
    """
    # make sure `lst_cols` is list-alike
    if (lst_cols is not None
            and len(lst_cols) > 0
            and not isinstance(lst_cols, (list, tuple, np.ndarray, pd.Series))):
        lst_cols = [lst_cols]
    # all columns except `lst_cols`
    idx_cols = df.columns.difference(lst_cols)
    # length of the list in each row (assumed equal across lst_cols)
    lens = df[lst_cols[0]].str.len()
    # repeat the original index once per list element
    idx = np.repeat(df.index.values, lens)
    # create "exploded" DF: non-list columns repeated, list columns flattened
    res = (pd.DataFrame({
                col: np.repeat(df[col].values, lens)
                for col in idx_cols},
                index=idx)
           .assign(**{col: np.concatenate(df.loc[lens > 0, col].values)
                      for col in lst_cols}))
    # rows whose lists were empty would otherwise be dropped; re-append them
    if (lens == 0).any():
        # DataFrame.append was removed in pandas 2.x -> use pd.concat
        res = (pd.concat([res, df.loc[lens == 0, idx_cols]], sort=False)
               .fillna(fill_value))
    # revert the original index order
    res = res.sort_index()
    # reset index if requested
    if not preserve_index:
        res = res.reset_index(drop=True)
    return res
# stringify so .str.split works even on NaN/numeric cells
omim['genes'] = omim['genes'].astype(str)
# explode the '|'-separated gene list: one (phenotype, gene) pair per row
omim = pd.DataFrame(explode(omim.assign(genes=omim.genes.str.split('|')), 'genes'))
# align the column name with the variant table for the merge below
omim = omim.rename(columns={'genes': 'gene_name'})
omim.head()
# inner join of filtered variants with OMIM annotations on gene symbol
new_data = pd.merge(data, omim, on="gene_name")
new_data.head()
new_data
#new_data.to_csv(r'/media/CERBADATA/vc_proj/all_outputs/joint_vcf/vep/overrep.csv', index = False)
# NOTE(review): IPython shell magic (notebook-only); the to_csv above is
# commented out, so this shows a previously written file — confirm it is
# current
!head /media/CERBADATA/vc_proj/all_outputs/joint_vcf/vep/overrep.csv
# now we need only Autosomal Recessive ones
auto_res = new_data[new_data['phenotypeInheritance'].str.contains("Autosomal recessive")]
#auto_res.to_csv(r'/media/CERBADATA/vc_proj/all_outputs/joint_vcf/vep/overrep_autores.csv', index = False)
!head /media/CERBADATA/vc_proj/all_outputs/joint_vcf/vep/overrep_autores.csv
auto_res
#dataframe for correlation plots
def _info_float(key, line):
    """Return the first float value of INFO key *key* on a VCF data line."""
    return float(re.findall(key + '=([^;]+)', line)[0])

# Build one row per variant with the cohort AF plus every gnomAD population
# AF. (The unused clinvar_sig lookup from the original was dropped.)
with open(file) as f:
    #df = pd.DataFrame(columns = ['REX', 'g_AFR', 'g_AMR', 'g_EAS', 'g_SAS', 'g_NFE', 'g_FIN']
    n_df_all = []
    for line in f:
        if line.startswith('#'):
            continue
        # cohort AF: the plain AF key (anchored on ';' so gnomAD_*_AF never
        # matches); may be comma-separated per alt allele -> sum
        REX = sum(float(x) for x in re.findall(';AF=([^;]+)', line)[0].split(','))
        n_df_all.append((REX,
                         _info_float('gnomAD_AFR_AF', line),
                         _info_float('gnomAD_AMR_AF', line),
                         _info_float('gnomAD_EAS_AF', line),
                         _info_float('gnomAD_SAS_AF', line),
                         _info_float('gnomAD_NFE_AF', line),
                         _info_float('gnomAD_FIN_AF', line),
                         _info_float('gnomAD_AF', line)))

data_4 = pd.DataFrame(n_df_all)
data_4.columns = ['REX', 'g_AFR', 'g_AMR', 'g_EAS', 'g_SAS', 'g_NFE', 'g_FIN', 'TOT']
# drop variants absent from gnomAD overall
data_4 = data_4[data_4['TOT'] != 0]
def heatMap(df, mirror):
    """Plot a correlation heat map of *df*'s numeric columns.

    Parameters
    ----------
    df : pd.DataFrame
    mirror : bool -- True draws the full symmetric matrix; False masks the
        upper triangle (self- and duplicate correlations).

    Fixes vs the original: the colormap was built twice, the heatmap/ticks
    code was duplicated in both branches, and `plt.show()` was effectively
    skipped for the mirror=True path; all unified below.
    """
    # Create Correlation df
    corr = df.corr()
    # Plot figsize
    fig, ax = plt.subplots(figsize=(10, 10))
    # Generate Color Map (build once, shared by both branches)
    colormap = sns.diverging_palette(220, 10, as_cmap=True)
    if mirror:
        mask = None
    else:
        # Drop self-correlations: mask the upper triangle incl. the diagonal
        mask = np.zeros_like(corr)
        mask[np.triu_indices_from(mask)] = True
    # Generate Heat Map, allow annotations and place floats in map
    sns.heatmap(corr, cmap=colormap, annot=True, fmt=".2f", mask=mask)
    # Apply xticks / yticks
    plt.xticks(range(len(corr.columns)), corr.columns)
    plt.yticks(range(len(corr.columns)), corr.columns)
    # show plot
    plt.show()

heatMap(data_4, mirror=False)
def corrdot(*args, **kwargs):
    """PairGrid upper-panel plotter: a disc plus text label, both sized and
    coloured by the Pearson correlation of the two series."""
    r = args[0].corr(args[1], 'pearson')
    # drop the leading zero, e.g. 0.42 -> .42
    label = f"{r:2.2f}".replace("0.", ".")
    axes = plt.gca()
    axes.set_axis_off()
    # disc area scales with |r|; colour encodes sign/magnitude on coolwarm
    axes.scatter([.5], [.5], abs(r) * 10000, [r], alpha=0.6, cmap="coolwarm",
                 vmin=-1, vmax=1, transform=axes.transAxes)
    # font size also grows with the correlation strength
    axes.annotate(label, [.5, .5], xycoords="axes fraction",
                  ha='center', va='center', fontsize=abs(r) * 40 + 5)
# global seaborn styling for the pair grid
sns.set(style='white', font_scale=1.6)
# NOTE(review): DataFrame.sample(10000) raises if data_4 has fewer than
# 10,000 rows — confirm the table is large enough (or pass replace=True)
iris = data_4.sample(10000)
g = sns.PairGrid(iris, aspect=1.4, diag_sharey=False)
# lower triangle: lowess-smoothed regression scatter
g.map_lower(sns.regplot, lowess=True, ci=False, line_kws={'color': 'black'})
# diagonal: per-column distributions (sns.distplot is deprecated in
# seaborn >= 0.11; histplot/displot are the replacements — left as-is)
g.map_diag(sns.distplot, kde_kws={'color': 'black'})
# upper triangle: correlation discs drawn by corrdot
g.map_upper(corrdot)